531c86c028e98348e54ce5dc9c28dcfdf23a6d2d,src/core/org/terrier/indexing/TaggedDocument.java,TaggedDocument,getNextTerm,#,267

Before Change


				}
				if (tag_close) {
					//System.err.println("processing close " + tagName);
					if ((_tags.isTagToProcess(tagName) || _tags.isTagToSkip(tagName)) && !tagName.equals("")) {
						processEndOfTag(upperCaseTagName);
						String stackTop = null;
						if (!stk.isEmpty()) {
							stackTop = stk.peek();
							if (_tags.isTagToProcess(stackTop)) {
								inTagToProcess = true;
								inTagToSkip = false;
							} else {
								inTagToProcess = false;
								inTagToSkip = true;
								continue;
							}
						} else {
							inTagToProcess = false;
							inTagToSkip = false;
						}
					}
					if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
						htmlStk.remove(upperCaseTagName);
					}
				}

After Change


		//the string to return as a result at the end of this method.
		String s = null;
		//StringBuilder sw = null;
		String tagName = null;
		boolean endOfTagName;
		//are we in a body of a tag?
		boolean btag = true;
		int ch = 0;
		//loop while the previous iteration read a tag (btag is initially true), and we have reached neither the end of file nor the end of the document
		while (btag && ch != -1 && !EOD) {
			//initialise the stringbuffer with the maximum length of a term (heuristic)
			//sw = new StringBuilder(tokenMaximumLength);
			boolean tag_close = false;
			boolean tag_open = false;
			error = false;
			try {
				if (lastChar == '<' || lastChar == '&') {
					ch = lastChar;
					lastChar = -1;
				}
				//While not EOF, and ch is neither '<' nor '&',
				//and ch is a whitespace character
				//CONSUME: whitespace
				//while ((ch < 1 && ch != '<' && ch != '&') || Character.isWhitespace((char)ch))
				while (ch != -1 && (( ch != '<' && ch != '&') && Character.isWhitespace((char)ch)))
				{
					ch = br.read();
					counter++;
					//if ch is '>' (end of tag), then there is an error.
					if (ch == '>')
						error = true;
				}
				
				//IDENTIFIES: start of opening or closing tags
				if (ch == '<') {
					ch = br.read();
					counter++;
					//if it is a closing tag, set tag_close to true
					if (ch == '/') {
						ch = br.read();
						counter++;
						tag_close = true;
					} else if (ch == '!') { //else if it is a comment, that is <!
						counter++;
						ch = br.read();
						if (ch == '[')
						{
							counter++;
							//CDATA block, read until another [
							while ((ch = br.read()) != '['  && ch != -1) {
								counter++;
							}
						}
						else
						{	//it is a comment	
							//read until you encounter a '<', or a '>', or the end of file
							while ((ch = br.read()) != '>' && ch != '<' && ch != -1) {
								counter++;
							} 
							counter++;
						}
					} else {
						tag_open = true; //otherwise, it is an opening tag
					}
				}
				
				if (ch == '&' ) {
					//read until an opening or the end of a tag is encountered, or the 
					//end of file, or a space, or a semicolon,
					//which means the end of the escape sequence &xxx;
					while ((ch = br.read()) != '>' && 
							ch != '<' && 
							ch != ' ' && 
							ch != ';' &&
							ch != -1) {
						counter++;
					} 
					counter++;
					 
				}
				
				//if the body of a tag is encountered
				if ((btag = (tag_close || tag_open))) {
					endOfTagName = false;
					//read until the end of file, or the start, or the end 
					//of a tag, and save only the tag name (appending stops
					//once a whitespace character is read)
					while (ch != -1 && ch != '<' && ch != '>') {
						if (! endOfTagName)
							tagNameSB.append((char)ch);
						ch = br.read();
						counter++;
						if (! endOfTagName && Character.isWhitespace((char)ch)) {
							endOfTagName = true;
							tagName = tagNameSB.toString();
							upperCaseTagName = StringTools.toUpperCase(tagName);
							//System.err.println("Found tag  " + tagName + (tag_open ? " open" : " close") );
							tagNameSB.setLength(0);
						}
					}
					//ch = br.read();counter++;
					if (! endOfTagName)
					{
						tagName = tagNameSB.toString();
						upperCaseTagName = StringTools.toUpperCase(tagName);
						//System.err.println("Found tag " + tagName+ (tag_open ? " open" : " close"));
						tagNameSB.setLength(0);
					}
				} else { //otherwise, if we are not in the body of a tag
					//read text to tokenise
					if (((char)ch) == '>') {
						counter++;
						ch = br.read();
					}
					while (ch != -1 && ch != '<' && ch != '&')
					{
						sw.append((char)ch);
						ch = br.read();
						counter++;
					}
//					while (ch != -1
//							&& (//ch=='&' || 
//								((ch >= 'A') && (ch <= 'Z'))
//							 || ((ch >= 'a') && (ch <= 'z'))
//							 || ((ch >= '0') && (ch <= '9')))) {
//						sw.append((char)ch);
//						ch = br.read();
//						counter++;
//					}
				}
				lastChar = ch;
				s = sw.toString();
				sw.setLength(0);
				if (tagName != null && !tagName.equals(""))
				{				
					if (tag_open) {
						//System.err.println("processing open " + tagName);
						final boolean tagToProcess = _tags.isTagToProcess(tagName);
						if (tagToProcess || _tags.isTagToSkip(tagName)) {
							stk.push(upperCaseTagName);
							if (tagToProcess) {
								inTagToProcess = true;
								inTagToSkip = false;
							} else {
								inTagToSkip = true;
								inTagToProcess = false;
								continue;
							}
						}
						if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
							htmlStk.add(upperCaseTagName);
							inHtmlTagToProcess = true;
						}
					}
					if (tag_close) {
						//System.err.println("processing close " + tagName);
						final boolean tagToProcess = _tags.isTagToProcess(tagName);
						if (tagToProcess || _tags.isTagToSkip(tagName)) {
							processEndOfTag(upperCaseTagName);
							String stackTop = null;
							if (!stk.isEmpty()) {
								stackTop = stk.peek();
								if (_tags.isTagToProcess(stackTop)) {
									inTagToProcess = true;
									inTagToSkip = false;
								} else {
									inTagToProcess = false;
									inTagToSkip = true;
									continue;
								}
							} else {
								inTagToProcess = false;
								inTagToSkip = false;
							}
						}
						if (_fields.isTagToProcess(tagName)) {
							htmlStk.remove(upperCaseTagName);
						}
					}